{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
      "/home/roph/anaconda3/envs/dask2/lib/python3.6/site-packages/ipykernel_launcher.py:141: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.\n"
     ]
    }
   ],
   "source": [
    "#########################\n",
    "### Import libraries ####\n",
    "#########################\n",
    "\n",
    "# NOTE: nothing used below (pd, np, json, time, nlp, model, mentionFinder, vector_normalize,\n",
    "# distance_calculation, extract_from_perspective_api, api_key, example_tweet_plot) is defined\n",
    "# in this notebook, so all of it has to come from this star import.\n",
    "from scripts_for_figure_2_and_table_2 import *\n",
    "\n",
    "\n",
    "###############################\n",
    "### Read in example tweets ####\n",
    "###############################\n",
    "\n",
    "# Example tweets have been paraphrased to preserve anonymity\n",
    "\n",
    "example_tweets = pd.read_excel(\n",
    "     \"paraphrase.xlsx\",\n",
    "     engine='openpyxl',\n",
    ")\n",
    "\n",
    "\n",
    "#####################################################\n",
    "### Score tweets using super-unsupervised method ####\n",
    "#####################################################\n",
    "\n",
    "\n",
    "# Re-attach hashtags written as \" # tag\" and strip whatever mentionFinder matches\n",
    "# (presumably @-mentions; the pattern lives in the imported module). Hashtags are kept.\n",
    "# astype(\"str\") runs last, so a missing paraphrase becomes the literal string \"nan\".\n",
    "example_tweets[\"full_text_temp\"] = example_tweets[\"paraphrase\"].str.replace(\" # \", \" #\").astype(\"str\")\n",
    "example_tweets[\"full_text_filtered\"] = example_tweets[\"full_text_temp\"].apply(lambda y: mentionFinder.sub(\"\", y))\n",
    "\n",
    "# Tokenize using spacy. By disabling the tagger, parser, and ner the processing is faster.\n",
    "docs = list(nlp.pipe(example_tweets[\"full_text_filtered\"], disable=[\"tagger\", \"parser\", \"ner\"]))\n",
    "\n",
    "# One list of lowercased lemmas per tweet. More preprocessing can be added or removed here.\n",
    "tokens_list = [\n",
    "    [\n",
    "        w.lemma_.lower()  # lemmatize and change to lowercase\n",
    "        for w in doc\n",
    "        if not w.is_stop  # remove stop words\n",
    "        and not w.is_punct  # remove punctuation\n",
    "        and not w.like_url  # remove urls\n",
    "        and not w.like_num  # remove numbers\n",
    "        and not w.is_space  # remove whitespace tokens\n",
    "    ]\n",
    "    for doc in docs\n",
    "]\n",
    "\n",
    "# Build the vocabulary lookup once. With fastText, model.words is a list, so testing\n",
    "# `x in model.words` per token rescans the whole vocabulary every time.\n",
    "vocabulary = set(model.words)\n",
    "\n",
    "# One entry per tweet: the normalized mean word vector, or NaN when none can be built.\n",
    "results_all = []\n",
    "index_all = []\n",
    "\n",
    "for index, tweet in enumerate(tokens_list):\n",
    "    index_all.append(index)\n",
    "    try:\n",
    "        # vector representation of every in-vocabulary token of this tweet\n",
    "        temp_vector = [model[x] for x in tweet if x in vocabulary]\n",
    "\n",
    "        if temp_vector:\n",
    "            mean_vector = np.mean(temp_vector, axis=0)\n",
    "            results_all.append(vector_normalize(mean_vector))\n",
    "        else:\n",
    "            # no token of this tweet is in the model vocabulary\n",
    "            results_all.append(np.nan)\n",
    "    except Exception as error:\n",
    "        # Stay best-effort (one bad tweet must not stop the run), but report the failure\n",
    "        # instead of hiding it; `Exception` leaves KeyboardInterrupt uncaught.\n",
    "        print(f\"Could not embed tweet {index}: {error!r}\")\n",
    "        results_all.append(np.nan)\n",
    "\n",
    "# combine into dataframe\n",
    "df_mean = pd.DataFrame({\"mean_vector\": results_all, \"id\": index_all})\n",
    "\n",
    "# Normalized reference vectors the tweet vectors are compared against.\n",
    "political = vector_normalize(model[\"political\"])\n",
    "hate = vector_normalize(model[\"hate\"])\n",
    "combined = vector_normalize(model[\"political\"] + model[\"hate\"])\n",
    "\n",
    "# Add text and index to mean scores. \"id\" lines up with the enumerate() positions above\n",
    "# as long as example_tweets still has its default 0..n-1 index.\n",
    "example_tweets[\"id\"] = example_tweets.index\n",
    "df_mean = df_mean.merge(example_tweets, on=\"id\")\n",
    "\n",
    "# Remove tweets without a vector. .copy() makes this an independent frame, so the column\n",
    "# assignments below do not trigger pandas' SettingWithCopyWarning.\n",
    "df_mean_reduced = df_mean[df_mean[\"mean_vector\"].notna()].copy()\n",
    "\n",
    "# Distance from every tweet vector to each reference vector.\n",
    "for column, reference in ((\"political\", political), (\"hate\", hate), (\"political_hate\", combined)):\n",
    "    df_mean_reduced[column] = df_mean_reduced[\"mean_vector\"].apply(\n",
    "        lambda x, reference=reference: distance_calculation(reference, x)\n",
    "    )\n",
    "\n",
    "\n",
    "###########################################\n",
    "### Score tweets using toxicity scores ####\n",
    "###########################################\n",
    "\n",
    "# set up api key and url\n",
    "# NOTE(review): `url` is not referenced again in this cell - confirm whether it is still needed.\n",
    "url = ('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze' +\n",
    "    '?key=' + api_key) #create api key from google to get access\n",
    "\n",
    "# Extract api scores from google's api, pausing one second before every 10th request.\n",
    "results = []\n",
    "for index, content in enumerate(df_mean_reduced[\"paraphrase\"]):\n",
    "    if index % 10 == 0:\n",
    "        time.sleep(1)\n",
    "    try:\n",
    "        results.append(extract_from_perspective_api(content, index))\n",
    "    except Exception as error:\n",
    "        # Keep one NaN placeholder per tweet so `results` stays aligned with the rows.\n",
    "        # `Exception` (not a bare except) so the loop can still be interrupted by hand.\n",
    "        print(f\"Perspective request {index} failed: {error!r}\")\n",
    "        results.append(np.nan)\n",
    "\n",
    "# Extract the TOXICITY summary score from each response; NaN when it is not available.\n",
    "all_results = []\n",
    "for response in results:\n",
    "    try:\n",
    "        payload = json.loads(response[0])\n",
    "        all_results.append(payload[\"attributeScores\"][\"TOXICITY\"][\"summaryScore\"][\"value\"])\n",
    "    except (TypeError, KeyError, IndexError, ValueError):\n",
    "        # NaN placeholder from a failed request, a payload without scores, or malformed JSON\n",
    "        all_results.append(np.nan)\n",
    "\n",
    "# append to dataframe with super-unsupervised scores\n",
    "df_mean_reduced[\"toxicity\"] = all_results\n",
    "\n",
    "# Keep only rows that have a paraphrase; the same rows feed the csv and the tex table.\n",
    "scored_tweets = df_mean_reduced[df_mean_reduced[\"paraphrase\"].notna()]\n",
    "scored_tweets.to_csv(\"graph_examples.csv\")\n",
    "\n",
    "\n",
    "#########################\n",
    "### Save to tex file ####\n",
    "#########################\n",
    "\n",
    "example_tweets_with_scores = scored_tweets[[\"id\", \"paraphrase\"]].rename(\n",
    "    columns={\"id\": \"Number\", \"paraphrase\": \"Tweet\"}\n",
    ")\n",
    "stuff_to_example_table = example_tweets_with_scores[\n",
    "    example_tweets_with_scores[\"Number\"].isin([12, 13, 36, 33, 11, 8, 27, 23])\n",
    "].copy()\n",
    "\n",
    "# \"\\\\newline \" is a literal backslash followed by \"newline \". Written as \"\\newline\" Python\n",
    "# reads it as a newline character plus \"ewline\", so no LaTeX macro is ever inserted.\n",
    "# regex=False keeps the backslash from being treated as a regex escape; the trailing\n",
    "# space stops LaTeX from gluing the macro name to the next word.\n",
    "stuff_to_example_table[\"Tweet\"] = (\n",
    "    stuff_to_example_table[\"Tweet\"]\n",
    "    .str.replace(\"\\n\", \"\\\\newline \", regex=False)\n",
    "    .str.replace(\"&amp;\", \" \", regex=False)\n",
    ")\n",
    "\n",
    "# None means \"do not limit the column width\"; the former value -1 is deprecated.\n",
    "pd.options.display.max_colwidth = None\n",
    "\n",
    "# write to latex (escape=False so the inserted \\newline macros survive)\n",
    "stuff_to_example_table.to_latex(\"example_tweets_graph.tex\", column_format='p{1cm} p{10cm}', index=False, escape=False)\n",
    "\n",
    "# add midlines: insert \\midrule after every row terminator that is followed by a\n",
    "# line starting with a space\n",
    "with open(\"example_tweets_graph.tex\", \"r\") as tex_file:\n",
    "    tex_source = tex_file.read()\n",
    "\n",
    "tex_source = tex_source.replace(\"\\\\\\\\\\n \", \"\\\\\\\\\\n\\\\midrule\\n \")\n",
    "with open(\"example_tweets_graph.tex\", \"w\") as tex_file:\n",
    "    tex_file.write(tex_source)\n",
    "\n",
    "\n",
    "######################\n",
    "### Create R plot ####\n",
    "######################\n",
    "\n",
    "# reads the csv written in the toxicity section above\n",
    "example_tweet_plot(file=\"graph_examples.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dask2",
   "language": "python",
   "name": "dask2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
